Data will be updated weekly
Each new record is accumulated data from previous days
This is the dataset that describes Equipment Losses & Death Toll
& Military Wounded & Prisoner of War of russians in 2022 Ukraine
russia War.
Data:
russia_losses_equipment.csv - contains Equipment Losses during the
war
Data Sources:
Main data sources are General Staff of the Armed Forces of Ukraine
and Ministry of Defence of Ukraine. Data from different battlefields are
gathered. The calculation is complicated by the high intensity of
hostilities. Data are being updated. https://www.kaggle.com/datasets/piterfm/2022-ukraine-russian-war
Tracking:
Aircraft
Helicopter
Tank
Armored Personnel Carrier (APC)
Multiple Rocket Launcher (MRL)
Field Artillery
Military Auto - has not been tracked since 2022-05-01; joined with Fuel
Tank into Vehicles and Fuel Tanks
Fuel Tank - has not been tracked since 2022-05-01; joined with Military
Auto into Vehicles and Fuel Tanks
Anti-aircraft warfare
Drone - UAV+RPA
Naval Ship - Warships, Boats
Anti-aircraft Warfare
Mobile SRBM System - has not been tracked since 2022-05-01; joined into
Cruise Missiles
Vehicles and Fuel Tanks - appear since 2022-05-01 as a sum of Fuel Tank
and Military Auto
Cruise Missiles - appear since 2022-05-01
Direction of Greatest Losses - appear since 2022-04-25
Acronyms:
MRL - Multiple Rocket Launcher,
APC - Armored Personnel Carrier,
SRBM - Short-range ballistic missile,
UAV - Unmanned Aerial Vehicle,
RPA - Remotely Piloted Aircraft.
Dataset History:
2022-03-02 - dataset is created (after 7 days of the War).
library(TSstudio)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(ggplot2)
library(ggfortify)
## Registered S3 methods overwritten by 'ggfortify':
## method from
## autoplot.Arima forecast
## autoplot.acf forecast
## autoplot.ar forecast
## autoplot.bats forecast
## autoplot.decomposed.ts forecast
## autoplot.ets forecast
## autoplot.forecast forecast
## autoplot.stl forecast
## autoplot.ts forecast
## fitted.ar forecast
## fortify.ts forecast
## residuals.ar forecast
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(tseries)
library(fUnitRoots)
library(pdR)
library(uroot)
library(TSA)
## Registered S3 methods overwritten by 'TSA':
## method from
## fitted.Arima forecast
## plot.Arima forecast
##
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
##
## acf, arima
## The following object is masked from 'package:utils':
##
## tar
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ MASS::select() masks dplyr::select()
## ✖ readr::spec() masks TSA::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tibbletime)
##
## Attaching package: 'tibbletime'
##
## The following object is masked from 'package:stats':
##
## filter
library(anomalize)
library(pracma)
##
## Attaching package: 'pracma'
##
## The following object is masked from 'package:purrr':
##
## cross
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(prophet)
## Zorunlu paket yükleniyor: Rcpp
## Zorunlu paket yükleniyor: rlang
##
## Attaching package: 'rlang'
##
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load the cumulative equipment-loss data and keep only the date and tank columns.
data <- read.csv("C:/Users/ASUS/Desktop/time series project/2022 Russia Ukraine War.csv")
data1 <- data                               # untouched copy of the full dataset
data <- data[, c("date", "tank")]
data$date <- as.Date(data$date, format = "%Y-%m-%d")
data
# Add year/month factor columns for the seasonal box plots below.
data2 <- data %>%
  dplyr::mutate(
    year  = lubridate::year(date),
    month = lubridate::month(date)
  )
data2$month <- as.factor(data2$month)
data2$year <- as.factor(data2$year)
data2
data3 <- read.zoo(data)                     # zoo series indexed by date
# Box plot of tank losses by month (months pooled across both years).
bp <- ggplot(data2, aes(x=month, y=tank, fill=month)) +
geom_boxplot()+
labs(title="Boxplot Across Months",x="Month", y = "tank")
bp
# Box plot of tank losses by year.
bp <- ggplot(data2, aes(x=year, y=tank, fill=year)) +
geom_boxplot()+
labs(title="Boxplot Across Years",x="year", y = "tank")
bp
# Monthly trajectories coloured by year; month is converted back to numeric for the x-axis.
ggplot(data2, aes(as.numeric(as.character(month)), tank)) + geom_line(aes(colour = year))+xlab("month")
# Sample ACF/PACF of the raw (cumulative) series.
acf(data3)
pacf(data3)
# Inspect the data frame and the first few zoo observations.
data
head(data3)
## 2022-02-25 2022-02-26 2022-02-27 2022-02-28 2022-03-01 2022-03-02
## 80 146 150 150 198 211
str(data3)
## 'zoo' series from 2022-02-25 to 2023-11-12
## Data: int [1:626] 80 146 150 150 198 211 217 251 269 285 ...
## Index: Date[1:626], format: "2022-02-25" "2022-02-26" "2022-02-27" "2022-02-28" "2022-03-01" ...
summary(data3)
## Index data3
## Min. :2022-02-25 Min. : 80
## 1st Qu.:2022-07-31 1st Qu.:1764
## Median :2023-01-03 Median :3037
## Mean :2023-01-03 Mean :2888
## 3rd Qu.:2023-06-08 3rd Qu.:3898
## Max. :2023-11-12 Max. :5349
autoplot(data3)
The plot shows us the data has a trend and no seasonality. Hence, the
process is not stationary.
data
# Split at 2023-10-12: everything up to and including that date is the
# training set, the remaining rows are the hold-out test set.
split <- which(data$date == "2023-10-12")
# seq_len(split) replaces the fragile `0:split` idiom (which only worked
# because R silently drops a 0 index); nrow(data) replaces length(data$date).
train <- data[seq_len(split), ]
test <- data[(split + 1):nrow(data), ]
test2 <- test                               # untouched copy for later use
traindf <- as_tbl_time(train, index = date)
class(traindf)
## [1] "tbl_time" "tbl_df" "tbl" "data.frame"
traindf %>%
time_decompose(tank, method = "stl", trend = "auto") %>%
anomalize(remainder, method = "gesd", alpha = 0.5) %>%
plot_anomaly_decomposition()
## frequency = 7 days
## trend = 91 days
traindf %>%
time_decompose(tank) %>%
anomalize(remainder, method = "gesd", alpha = 0.5) %>%
time_recompose() %>%
plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.5)
## frequency = 7 days
## trend = 91 days
# Remove the observations flagged as anomalies (GESD test, alpha = 0.5) from
# the STL decomposition; only rows with anomaly == 'No' are kept.
traindf=traindf %>%
time_decompose(tank, method = "stl", trend = "auto") %>%
anomalize(remainder, method = "gesd", alpha = 0.5) %>%
filter(anomaly == 'No')
## frequency = 7 days
## trend = 91 days
# Re-attach the cleaned ("observed") values to the full training date range;
# dates dropped as anomalies become NA after this left join.
train=merge(train["date"],traindf[,c("date","observed")],by="date",all.x = TRUE)
# tsclean() interpolates the NA gaps left by the anomaly removal.
train["observed"]=tsclean(train$observed)
train["observed"]=as.numeric(train$observed)
colnames(train)=c("date","tank")
train=as_tbl_time(train,index=date)
test=as_tbl_time(test,index=date)
train2=train # keep a tbl_time copy for Prophet later
train=read.zoo(train) # zoo version used by the ARIMA/ETS/TBATS models
ts_plot(train)
# Box-Cox lambda via profile likelihood; lambda near 1 means no transformation is needed.
b=boxcox(lm(train ~ 1))
b$x[which.max(b$y)]
## [1] 1.030303
Since lambda is close to 1, there is no need to transform data.
acf(train)
The ACF plot shows us there exists deterministic trend.
pacf(train)
Since the ACF plot shows us that the process is not stationary, no need
to look the PACF plot.
kpss.test(train, null="Level")
## Warning in kpss.test(train, null = "Level"): p-value smaller than printed
## p-value
##
## KPSS Test for Level Stationarity
##
## data: train
## KPSS Level = 8.4175, Truncation lag parameter = 6, p-value = 0.01
Since the p-value is less than 0.05, we reject the null hypothesis and conclude that the dataset is not stationary.
kpss.test(train, null="Trend")
## Warning in kpss.test(train, null = "Trend"): p-value smaller than printed
## p-value
##
## KPSS Test for Trend Stationarity
##
## data: train
## KPSS Trend = 1.8388, Truncation lag parameter = 6, p-value = 0.01
Since p-value is less than 0.05, the trend is stochastic.
mean(train)
## [1] 2766.877
Since the mean of the time series is not zero or close to zero, we need to apply ADF test.
adfTest(train, lags=1, type="c")
## Warning in adfTest(train, lags = 1, type = "c"): p-value smaller than printed
## p-value
##
## Title:
## Augmented Dickey-Fuller Test
##
## Test Results:
## PARAMETER:
## Lag Order: 1
## STATISTIC:
## Dickey-Fuller: -5.7602
## P VALUE:
## 0.01
##
## Description:
## Sun Jan 21 20:09:15 2024 by user: ASUS
Since p-value is less than 0.05, the process has no unit root.
There is a contradiction between the KPSS test and the ADF test, so we
need to check the plots.
Since the ACF plot shows a linear decay, we need to difference the
data.
pp.test(train)
##
## Phillips-Perron Unit Root Test
##
## data: train
## Dickey-Fuller Z(alpha) = -4.8213, Truncation lag parameter = 6, p-value
## = 0.8408
## alternative hypothesis: stationary
There is a contradiction between the KPSS test & PP test on one side and
the ADF test on the other, so we need to check the plots.
Since the ACF plot shows a linear decay, we need to difference the
data.
difftank=diff(train)
acf(difftank)
pacf(difftank)
According to the ACF plot, the process still is not stationary.
kpss.test(difftank, null="Level")
## Warning in kpss.test(difftank, null = "Level"): p-value smaller than printed
## p-value
##
## KPSS Test for Level Stationarity
##
## data: difftank
## KPSS Level = 2.0185, Truncation lag parameter = 6, p-value = 0.01
Since the p-value is less than 0.05, we reject the null hypothesis and conclude that the differenced dataset is not stationary.
mean(difftank)
## [1] 7.883838
Since the mean of the time series is not zero or close to zero, we need to apply ADF test.
adfTest(difftank, lags=5, type="c")
## Warning in adfTest(difftank, lags = 5, type = "c"): p-value smaller than
## printed p-value
##
## Title:
## Augmented Dickey-Fuller Test
##
## Test Results:
## PARAMETER:
## Lag Order: 5
## STATISTIC:
## Dickey-Fuller: -5.5029
## P VALUE:
## 0.01
##
## Description:
## Sun Jan 21 20:09:15 2024 by user: ASUS
Since the p-value is smaller than 0.05, the ADF test indicates the
differenced process has no unit root. However, the KPSS test still
rejects stationarity, so we difference the series once more.
pp.test(difftank)
## Warning in pp.test(difftank): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: difftank
## Dickey-Fuller Z(alpha) = -554.04, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary
diff2tank=diff(difftank)
acf(diff2tank)
pacf(diff2tank)
kpss.test(diff2tank, null="Level")
## Warning in kpss.test(diff2tank, null = "Level"): p-value greater than printed
## p-value
##
## KPSS Test for Level Stationarity
##
## data: diff2tank
## KPSS Level = 0.020304, Truncation lag parameter = 6, p-value = 0.1
Since the p-value is greater than 0.05, we fail to reject the null
hypothesis and conclude that the differenced dataset is
stationary.
mean(diff2tank)
## [1] 0
Since the mean of the twice-differenced series is zero, we apply the ADF
test without a constant (type = "nc") and the PP test.
adfTest(diff2tank,lags =4,type = "nc" )
## Warning in adfTest(diff2tank, lags = 4, type = "nc"): p-value smaller than
## printed p-value
##
## Title:
## Augmented Dickey-Fuller Test
##
## Test Results:
## PARAMETER:
## Lag Order: 4
## STATISTIC:
## Dickey-Fuller: -16.7239
## P VALUE:
## 0.01
##
## Description:
## Sun Jan 21 20:09:16 2024 by user: ASUS
pp.test(diff2tank)
## Warning in pp.test(diff2tank): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: diff2tank
## Dickey-Fuller Z(alpha) = -738.73, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary
Since p-value is less than 0.05, the differenced process has no unit
root.
Hence, for model, d=2 (I(2)).
acf(diff2tank)
MA(4) or MA(1)
pacf(diff2tank)
AR(4)
eacf(diff2tank)
## AR/MA
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x o o o x o o o o o o o o o
## 1 x x o o x o o o o o o o o o
## 2 x x x o x o o o o o o o o o
## 3 x x x x o o o o o o o o o o
## 4 x x o x x o o o o o o o o o
## 5 x x o x o o o o o o o o o o
## 6 x x x x o o o o o o o o o o
## 7 x x o x o o o x o o o o o o
According to the ACF, PACF, and EACF plots, the candidate models are
ARIMA(4,2,1) and ARIMA(4,2,4).
auto.arima(train)
## Series: train
## ARIMA(2,2,2)
##
## Coefficients:
## ar1 ar2 ma1 ma2
## 0.7691 0.0390 -1.5921 0.6035
## s.e. 0.1523 0.0584 0.1478 0.1388
##
## sigma^2 = 26.97: log likelihood = -1817.17
## AIC=3644.35 AICc=3644.45 BIC=3666.28
checkFit=Arima(train,order = c(4,2,1))
checkFit
## Series: train
## ARIMA(4,2,1)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1
## 0.1364 0.1347 0.0266 0.0509 -0.9516
## s.e. 0.0467 0.0457 0.0450 0.0451 0.0208
##
## sigma^2 = 27.09: log likelihood = -1817.94
## AIC=3647.88 AICc=3648.02 BIC=3674.19
Since MA(1) is not 1 or close to 1, we did not get much
difference.
ar4=0.0509/0.0451
ma1=abs(-0.9516/0.0208)
ar4
## [1] 1.128603
ma1
## [1] 45.75
fit=Arima(train,order = c(3,2,1))
fit
## Series: train
## ARIMA(3,2,1)
##
## Coefficients:
## ar1 ar2 ar3 ma1
## 0.1276 0.1319 0.0243 -0.9419
## s.e. 0.0470 0.0463 0.0455 0.0217
##
## sigma^2 = 27.1: log likelihood = -1818.58
## AIC=3647.15 AICc=3647.26 BIC=3669.08
ar3=0.0243/0.0455
ma1=abs(-0.9419/0.0217)
ar3
## [1] 0.5340659
ma1
## [1] 43.40553
fit1=Arima(train,order = c(2,2,1))
fit1
## Series: train
## ARIMA(2,2,1)
##
## Coefficients:
## ar1 ar2 ma1
## 0.1252 0.1297 -0.9370
## s.e. 0.0474 0.0466 0.0212
##
## sigma^2 = 27.07: log likelihood = -1818.72
## AIC=3645.44 AICc=3645.51 BIC=3662.98
ar2=0.1297/0.0466
ma1=abs(-0.9370/0.0212)
ar2
## [1] 2.783262
ma1
## [1] 44.19811
Since the parameters are significant, fit1 can be used.
fit2=Arima(train,order = c(4,2,4))
fit2
## Series: train
## ARIMA(4,2,4)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1 ma2 ma3 ma4
## -0.0955 -0.2698 0.7856 0.0431 -0.7213 0.2082 -1.0641 0.6089
## s.e. 0.1525 0.0989 0.1168 0.0582 0.1482 0.0158 0.0292 0.1395
##
## sigma^2 = 26.78: log likelihood = -1814.63
## AIC=3647.26 AICc=3647.56 BIC=3686.72
ar4=0.0431/0.0582
ma4=0.6089/0.1395
ar4
## [1] 0.7405498
ma4
## [1] 4.364875
fit3=Arima(train,order = c(4,2,3))
fit3
## Series: train
## ARIMA(4,2,3)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1 ma2 ma3
## 0.1574 -0.8454 0.1250 0.1581 -0.972 1.0286 -0.9402
## s.e. 0.0478 0.0461 0.0476 0.0463 0.023 0.0174 0.0222
##
## sigma^2 = 26.49: log likelihood = -1812.4
## AIC=3640.8 AICc=3641.05 BIC=3675.88
ar4=0.1581/0.0463
ma3=abs(-0.9402/0.0222)
ar4
## [1] 3.414687
ma3
## [1] 42.35135
Since the parameters are significant, fit3 can be used.
fit4=Arima(train,order = c(3,2,4))
fit4
## Series: train
## ARIMA(3,2,4)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 ma3 ma4
## 0.2321 0.0154 0.4459 -1.0472 0.2096 -0.5496 0.4042
## s.e. 0.6129 0.4715 0.2347 0.5938 0.8854 0.5089 0.1803
##
## sigma^2 = 26.96: log likelihood = -1815.59
## AIC=3647.18 AICc=3647.43 BIC=3682.26
ar3=0.4459/0.2347
ma4=0.4042/0.1803
ar3
## [1] 1.899872
ma4
## [1] 2.241819
fit5=Arima(train,order = c(3,2,3))
fit5
## Series: train
## ARIMA(3,2,3)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 ma3
## 0.5009 0.3118 -0.0289 -1.3234 0.1270 0.2091
## s.e. 0.4497 0.3236 0.0635 0.4479 0.6814 0.2606
##
## sigma^2 = 27.03: log likelihood = -1816.86
## AIC=3647.72 AICc=3647.91 BIC=3678.42
ar3=abs(-0.0289/0.0635)
ma3=0.2091/0.2606
ar3
## [1] 0.4551181
ma3
## [1] 0.8023791
fit6=Arima(train,order = c(2,2,3))
fit6
## Series: train
## ARIMA(2,2,3)
##
## Coefficients:
## Warning in sqrt(diag(x$var.coef)): NaNs üretimi
## ar1 ar2 ma1 ma2 ma3
## 0.0777 0.6394 -0.8882 -0.5889 0.4933
## s.e. NaN NaN NaN NaN NaN
##
## sigma^2 = 27.03: log likelihood = -1817.37
## AIC=3646.74 AICc=3646.88 BIC=3673.05
fit7=Arima(train,order = c(2,2,2))
fit7
## Series: train
## ARIMA(2,2,2)
##
## Coefficients:
## ar1 ar2 ma1 ma2
## 0.7691 0.0390 -1.5921 0.6035
## s.e. 0.1523 0.0584 0.1478 0.1388
##
## sigma^2 = 26.97: log likelihood = -1817.17
## AIC=3644.35 AICc=3644.45 BIC=3666.28
ar2=0.0390/0.0584
ma2=0.6035/0.1388
ar2
## [1] 0.6678082
ma2
## [1] 4.347983
fit7=Arima(train,order = c(2,2,1))
fit7
## Series: train
## ARIMA(2,2,1)
##
## Coefficients:
## ar1 ar2 ma1
## 0.1252 0.1297 -0.9370
## s.e. 0.0474 0.0466 0.0212
##
## sigma^2 = 27.07: log likelihood = -1818.72
## AIC=3645.44 AICc=3645.51 BIC=3662.98
ar4=0.0929/0.0503
ma1=abs(-0.9512/0.0262)
ar4
## [1] 1.846918
ma1
## [1] 36.30534
fit8=Arima(train,order = c(2,2,4))
fit8
## Series: train
## ARIMA(2,2,4)
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 ma4
## -1.2556 -0.4219 0.4488 -0.5833 -0.4429 -0.1575
## s.e. 0.2831 0.2760 0.2803 0.1059 0.2385 0.0426
##
## sigma^2 = 27.08: log likelihood = -1817.34
## AIC=3648.68 AICc=3648.87 BIC=3679.37
ma4=abs(-0.1575/0.0426)
ar2=abs(-0.4219/0.2760)
ar2
## [1] 1.528623
ma4
## [1] 3.697183
Since fit3 has the lowest AIC, we choose it first.
fit9=Arima(train,order = c(4,1,5))
fit9
## Series: train
## ARIMA(4,1,5)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1 ma2 ma3 ma4
## 1.3234 -0.9494 1.3522 -0.7272 -1.1291 0.9044 -1.3126 0.6042
## s.e. 0.1336 0.0812 0.0775 0.1268 0.1400 0.0767 0.0837 0.1243
## ma5
## -0.0206
## s.e. 0.0458
##
## sigma^2 = 26.82: log likelihood = -1817.26
## AIC=3654.52 AICc=3654.9 BIC=3698.39
ma4=abs(-0.7272/0.1268)
ar2=abs(-0.0206/0.0458)
ar2
## [1] 0.4497817
ma4
## [1] 5.735016
fit10=Arima(train,order = c(1,1,5))
fit10
## Series: train
## ARIMA(1,1,5)
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4 ma5
## 0.9989 -0.8178 0.0278 -0.1118 0.0234 -0.0292
## s.e. 0.0015 0.0431 0.0552 0.0598 0.0588 0.0449
##
## sigma^2 = 27.23: log likelihood = -1822.66
## AIC=3659.32 AICc=3659.51 BIC=3690.02
ma4=abs(0.9989/0.0015)
ar2=abs(-0.0292/0.0449)
ar2
## [1] 0.6503341
ma4
## [1] 665.9333
fit1=ARIMA(2,2,1) and fit3=ARIMA(4,2,3)
fit3
## Series: train
## ARIMA(4,2,3)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1 ma2 ma3
## 0.1574 -0.8454 0.1250 0.1581 -0.972 1.0286 -0.9402
## s.e. 0.0478 0.0461 0.0476 0.0463 0.023 0.0174 0.0222
##
## sigma^2 = 26.49: log likelihood = -1812.4
## AIC=3640.8 AICc=3641.05 BIC=3675.88
# Standardize the ARIMA(4,2,3) residuals to unit variance for the diagnostics below.
fit3_res <- residuals(fit3)
r <- fit3_res / sd(fit3_res)
head(r)
## Time Series:
## Start = 19048
## End = 19053
## Frequency = 1
## [1] 0.01277423 -0.03832256 2.58245119 1.49520387 0.76980521 -0.11041227
autoplot(r)+geom_line(y=0)+theme_minimal()+ggtitle("Plot of The Residuals")
The residuals are scattered around zero, which can be interpreted as a zero mean.
ggplot(r, aes(sample = r)) +stat_qq()+geom_qq_line()+ggtitle("QQ Plot of the Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
QQ Plot shows that most of the residuals of the model does not lie on 45
degree straight line. This indicates residuals are not normally
distributed.
ggplot(r,aes(x=r))+geom_histogram(bins=20)+geom_density()+ggtitle("Histogram of Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
The histogram of the residuals shows that they do not have a symmetric
distribution.
checkresiduals(fit3)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,2,3)
## Q* = 17.197, df = 3, p-value = 0.0006437
##
## Model df: 7. Total lags used: 10
Since p-value is less than 0.05, The model exhibit lack of fit.
g1<-ggAcf(as.vector(r),lag.max = 72)+theme_minimal()+ggtitle("ACF of Residuals")
g2<-ggPacf(as.vector(r),lag.max = 72)+theme_minimal()+ggtitle("PACF of Residuals") # homoscedasticity check
grid.arrange(g1,g2,ncol=2)
There exist significant spikes in the plots.
shapiro.test(r)
##
## Shapiro-Wilk normality test
##
## data: r
## W = 0.91643, p-value < 2.2e-16
Now, we cannot continue our analysis with ARIMA(4,2,3). Maybe we can continue with fit1, which is ARIMA(2,2,1). Let us check the diagnostics of ARIMA(2,2,1):
fit1
## Series: train
## ARIMA(2,2,1)
##
## Coefficients:
## ar1 ar2 ma1
## 0.1252 0.1297 -0.9370
## s.e. 0.0474 0.0466 0.0212
##
## sigma^2 = 27.07: log likelihood = -1818.72
## AIC=3645.44 AICc=3645.51 BIC=3662.98
r2=resid(fit1)/sd(residuals(fit1))
ggplot(r2, aes(sample = r2)) +stat_qq()+geom_qq_line()+ggtitle("QQ Plot of the Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
QQ Plot shows that most of the residuals of the model does not lie on 45 degree straight line. This indicates residuals are not normally distributed.
autoplot(r2)+geom_line(y=0)+theme_minimal()+ggtitle("Plot of The Residuals")
Residuals are scattered around zero and it can be interpreted as zero mean
ggplot(r2, aes(sample = r2)) +stat_qq()+geom_qq_line()+ggtitle("QQ Plot of the Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
QQ Plot shows that most of the residuals of the model does not lie on 45
degree straight line. This indicates residuals are not normally
distributed.
ggplot(r2,aes(x=r2))+geom_histogram(bins=20)+geom_density()+ggtitle("Histogram of Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
The histogram of the residuals shows that they do not have a symmetric
distribution.
checkresiduals(fit1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,2,1)
## Q* = 12.457, df = 7, p-value = 0.08649
##
## Model df: 3. Total lags used: 10
Since the p-value (0.08649) is greater than 0.05, the model does not exhibit lack of fit.
g1<-ggAcf(as.vector(r2),lag.max = 72)+theme_minimal()+ggtitle("ACF of Residuals")
g2<-ggPacf(as.vector(r2),lag.max = 72)+theme_minimal()+ggtitle("PACF of Residuals") # homoscedasticity check
grid.arrange(g1,g2,ncol=2)
There exist significant spikes in the plots.
g1<-ggAcf(as.vector(r2),lag.max = 72)+theme_minimal()+ggtitle("ACF of Squared Residuals")
g2<-ggPacf(as.vector(r2),lag.max = 72)+theme_minimal()+ggtitle("PACF of Squared Residuals") # homoscedasticity check
grid.arrange(g1,g2,ncol=2)
There exist significant spikes in the plots.
shapiro.test(r2)
##
## Shapiro-Wilk normality test
##
## data: r2
## W = 0.90164, p-value < 2.2e-16
Since the p-value is less than 0.05, we reject the null hypothesis that the residuals are normally distributed.
# Breusch-Godfrey test for serial correlation in the fit3 residuals:
# regress the standardized residuals on their first lag, then test up to order 15.
m=lm(r~1+zlag(r))
library(lmtest)
bgtest(m, order = 15)
##
## Breusch-Godfrey test for serial correlation of order up to 15
##
## data: m
## LM test = 26.156, df = 15, p-value = 0.03641
Since p value is less than alpha, the residuals of the model are correlated, according to results of Breusch-Godfrey Test.
m2=lm(r2~1+zlag(r2))
bgtest(m2, order = 15)
##
## Breusch-Godfrey test for serial correlation of order up to 15
##
## data: m2
## LM test = 25.8, df = 15, p-value = 0.04018
Since p value is less than alpha, the residuals of the model are correlated, according to results of Breusch-Godfrey Test.
rr=r^2
g1<-ggAcf(as.vector(rr))+theme_minimal()+ggtitle("ACF of Squared Residuals")
g2<-ggPacf(as.vector(rr),lag.max = 72)+theme_minimal()+ggtitle("PACF of Squared Residuals") # homoscedasticity check
grid.arrange(g1,g2,ncol=2)
library(FinTS)
##
## Attaching package: 'FinTS'
## The following object is masked from 'package:forecast':
##
## Acf
ArchTest(r)
##
## ARCH LM-test; Null hypothesis: no ARCH effects
##
## data: r
## Chi-squared = 17.147, df = 12, p-value = 0.1441
Although there are some significant spikes in the ACF/PACF plots of the squared residuals, the ARCH LM test p-value is greater than 0.05, so the residuals exhibit no ARCH effects (no strong evidence of heteroscedasticity).
rr2=r2^2
g1<-ggAcf(as.vector(rr2),lag.max = 72)+theme_minimal()+ggtitle("ACF of Squared Residuals")
g2<-ggPacf(as.vector(rr2),lag.max = 72)+theme_minimal()+ggtitle("PACF of Squared Residuals") # homoscedasticity check
grid.arrange(g1,g2,ncol=2)
library(FinTS)
ArchTest(r2)
##
## ARCH LM-test; Null hypothesis: no ARCH effects
##
## data: r2
## Chi-squared = 13.382, df = 12, p-value = 0.3419
Although there are some significant spikes in the ACF/PACF plots of the
squared residuals, the ARCH LM test p-value is greater than 0.05, so the
residuals exhibit no ARCH effects (no strong evidence of heteroscedasticity).
Now, we can continue our analysis with fit1 which is ARIMA(2,2,1)
since it does not exhibit lack of fit.
# Forecast the hold-out horizon with the chosen ARIMA(2,2,1) model (fit1).
set.seed(497)
test=read.zoo(test) # convert the hold-out set to a zoo series
f=forecast(fit1,h=length(test))
summary(f)
##
## Forecast method: ARIMA(2,2,1)
##
## Model Information:
## Series: train
## ARIMA(2,2,1)
##
## Coefficients:
## ar1 ar2 ma1
## 0.1252 0.1297 -0.9370
## s.e. 0.0474 0.0466 0.0212
##
## sigma^2 = 27.07: log likelihood = -1818.72
## AIC=3645.44 AICc=3645.51 BIC=3662.98
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.05560763 5.180923 3.655655 0.0164543 0.2503019 0.463294
## ACF1
## Training set 0.005892095
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 19643 4835.454 4828.787 4842.122 4825.257 4845.652
## 19644 4842.717 4832.362 4853.072 4826.880 4858.553
## 19645 4850.917 4836.955 4864.878 4829.565 4872.269
## 19646 4859.339 4842.091 4876.587 4832.960 4885.718
## 19647 4867.911 4847.544 4888.277 4836.762 4899.059
## 19648 4876.530 4853.171 4899.889 4840.805 4912.254
## 19649 4885.174 4858.901 4911.447 4844.993 4925.355
## 19650 4893.828 4864.691 4922.965 4849.266 4938.390
## 19651 4902.486 4870.512 4934.461 4853.586 4951.387
## 19652 4911.146 4876.348 4945.945 4857.927 4964.366
## 19653 4919.807 4882.186 4957.428 4862.271 4977.344
## 19654 4928.468 4888.018 4968.919 4866.605 4990.332
## 19655 4937.130 4893.838 4980.421 4870.921 5003.339
## 19656 4945.791 4899.640 4991.941 4875.210 5016.372
## 19657 4954.452 4905.423 5003.482 4879.468 5029.437
## 19658 4963.114 4911.181 5015.046 4883.690 5042.537
## 19659 4971.775 4916.915 5026.635 4887.874 5055.676
## 19660 4980.437 4922.621 5038.252 4892.016 5068.857
## 19661 4989.098 4928.300 5049.896 4896.115 5082.081
## 19662 4997.759 4933.949 5061.570 4900.169 5095.350
## 19663 5006.421 4939.567 5073.274 4904.177 5108.664
## 19664 5015.082 4945.155 5085.009 4908.138 5122.026
## 19665 5023.744 4950.712 5096.775 4912.051 5135.436
## 19666 5032.405 4956.237 5108.573 4915.916 5148.894
## 19667 5041.066 4961.730 5120.402 4919.733 5162.400
## 19668 5049.728 4967.192 5132.264 4923.500 5175.956
## 19669 5058.389 4972.621 5144.157 4927.218 5189.560
## 19670 5067.051 4978.018 5156.083 4930.887 5203.214
## 19671 5075.712 4983.382 5168.041 4934.506 5216.918
## 19672 5084.373 4988.715 5180.032 4938.076 5230.670
## 19673 5093.035 4994.015 5192.054 4941.597 5244.472
ets=ets(train,model = "ZZZ")
etsforc=forecast(ets,h=length(test))
summary(ets)
## ETS(A,A,N)
##
## Call:
## ets(y = train, model = "ZZZ")
##
## Smoothing parameters:
## alpha = 0.9999
## beta = 0.1456
##
## Initial states:
## l = 124.1091
## b = 13.1785
##
## sigma: 5.2568
##
## AIC AICc BIC
## 5782.002 5782.104 5803.945
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0549773 5.239054 3.688071 -0.001526009 0.2651306 0.4674022
## ACF1
## Training set 0.03969414
r=etsforc$residuals
ggplot(r, aes(sample = r)) +stat_qq()+geom_qq_line()+ggtitle("QQ Plot of the Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
autoplot(r)+geom_line(y=0)+theme_minimal()+ggtitle("Plot of The Residuals")
ggplot(r,aes(x=r))+geom_histogram(bins=20)+geom_density()+ggtitle("Histogram of Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
shapiro.test(r)
##
## Shapiro-Wilk normality test
##
## data: r
## W = 0.91062, p-value < 2.2e-16
nnetar.m=nnetar(train)
nnetar_forecast<-forecast(nnetar.m,h=length(test))
nnetar.m$method
## [1] "NNAR(1,1)"
nnetar.m$call
## nnetar(y = train)
nnetar.m$model
##
## Average of 20 networks, each of which is
## a 1-1-1 network with 4 weights
## options were - linear output units
r=nnetar.m$residuals
ggplot(r, aes(sample = r)) +stat_qq()+geom_qq_line()+ggtitle("QQ Plot of the Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Warning: Removed 1 rows containing non-finite values (`stat_qq()`).
## Warning: Removed 1 rows containing non-finite values (`stat_qq_line()`).
autoplot(r)+geom_line(y=0)+theme_minimal()+ggtitle("Plot of The Residuals")
## Warning: Removed 1 row containing missing values (`geom_line()`).
ggplot(r,aes(x=r))+geom_histogram(bins=20)+geom_density()+ggtitle("Histogram of Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 1 rows containing non-finite values (`stat_density()`).
shapiro.test(r)
##
## Shapiro-Wilk normality test
##
## data: r
## W = 0.97479, p-value = 1.391e-08
tbatsmodel<-tbats(ts(train))
tbats_forecast<-forecast(tbatsmodel,h=length(test))
tbatsmodel
## BATS(1, {0,0}, 0.995, -)
##
## Call: tbats(y = ts(train))
##
## Parameters
## Alpha: 1.051277
## Beta: 0.122806
## Damping Parameter: 0.994958
##
## Seed States:
## [,1]
## [1,] 125.04875
## [2,] 15.85002
##
## Sigma: 5.21842
## AIC: 5777.306
r=ts(tbats_forecast$residuals)
ggplot(r, aes(sample = r)) +stat_qq()+geom_qq_line()+ggtitle("QQ Plot of the Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
autoplot(r)+geom_line(y=0)+theme_minimal()+ggtitle("Plot of The Residuals")
ggplot(r,aes(x=r))+geom_histogram(bins=20)+geom_density()+ggtitle("Histogram of Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
shapiro.test(r)
##
## Shapiro-Wilk normality test
##
## data: r
## W = 0.90448, p-value < 2.2e-16
# Grid search over Prophet's changepoint hyper-parameters, scored on the test set.
changepoint_prior_scale=c(0.001, 0.01, 0.1, 0.5)
changepoint_range=c(0.8,0.85,0.90, 0.95)
colnames(train2)=c("ds","y") # Prophet requires columns named ds (date) and y (value)
for(i in changepoint_prior_scale){
for(j in changepoint_range){
prophet.m=prophet(train2,daily.seasonality = FALSE,yearly.seasonality = FALSE,changepoint.prior.scale =i ,changepoint.range =j )
# Forecast exactly the hold-out horizon instead of a hard-coded 31 days.
future=make_future_dataframe(prophet.m, periods = length(test))
prophet_forecast=predict(prophet.m,future)
print(i)
print(j)
# BUG FIX: yhat covers the training period PLUS the forecast horizon, so the
# original accuracy(prophet_forecast$yhat, test) compared misaligned series
# (hence the ~90% MAPE in the output). Score only the forecast-horizon values.
k=data.frame(accuracy(tail(prophet_forecast$yhat, length(test)),test))
rownames(k)=paste(as.character(i),as.character(j),sep = "-")
print(k)
}}
## [1] 0.001
## [1] 0.8
## ME RMSE MAE MPE MAPE
## 0.001-0.8 4534.172 4534.569 4534.172 88.00939 88.00939
## [1] 0.001
## [1] 0.85
## ME RMSE MAE MPE MAPE
## 0.001-0.85 4557.406 4557.785 4557.406 88.46109 88.46109
## [1] 0.001
## [1] 0.9
## ME RMSE MAE MPE MAPE
## 0.001-0.9 4531.847 4532.249 4531.847 87.96403 87.96403
## [1] 0.001
## [1] 0.95
## ME RMSE MAE MPE MAPE
## 0.001-0.95 4595.92 4596.266 4595.92 89.21016 89.21016
## [1] 0.01
## [1] 0.8
## ME RMSE MAE MPE MAPE
## 0.01-0.8 4794.007 4794.066 4794.007 93.07424 93.07424
## [1] 0.01
## [1] 0.85
## ME RMSE MAE MPE MAPE
## 0.01-0.85 4794.718 4794.776 4794.718 93.08809 93.08809
## [1] 0.01
## [1] 0.9
## ME RMSE MAE MPE MAPE
## 0.01-0.9 4795.329 4795.371 4795.329 93.10193 93.10193
## [1] 0.01
## [1] 0.95
## ME RMSE MAE MPE MAPE
## 0.01-0.95 4794.353 4794.397 4794.353 93.08295 93.08295
## [1] 0.1
## [1] 0.8
## ME RMSE MAE MPE MAPE
## 0.1-0.8 4792.21 4792.223 4792.21 93.04694 93.04694
## [1] 0.1
## [1] 0.85
## ME RMSE MAE MPE MAPE
## 0.1-0.85 4792.386 4792.397 4792.386 93.05119 93.05119
## [1] 0.1
## [1] 0.9
## ME RMSE MAE MPE MAPE
## 0.1-0.9 4791.264 4791.274 4791.264 93.02981 93.02981
## [1] 0.1
## [1] 0.95
## ME RMSE MAE MPE MAPE
## 0.1-0.95 4790.621 4790.633 4790.621 93.01801 93.01801
## [1] 0.5
## [1] 0.8
## ME RMSE MAE MPE MAPE
## 0.5-0.8 4792.126 4792.139 4792.126 93.04606 93.04606
## [1] 0.5
## [1] 0.85
## ME RMSE MAE MPE MAPE
## 0.5-0.85 4791.652 4791.663 4791.652 93.03733 93.03733
## [1] 0.5
## [1] 0.9
## ME RMSE MAE MPE MAPE
## 0.5-0.9 4790.899 4790.908 4790.899 93.02324 93.02324
## [1] 0.5
## [1] 0.95
## ME RMSE MAE MPE MAPE
## 0.5-0.95 4789.96 4789.972 4789.96 93.00599 93.00599
# Refit Prophet with the best hyper-parameters from the grid search
# (changepoint.prior.scale = 0.001, changepoint.range = 0.9).
prophet.m=prophet(train2,daily.seasonality = FALSE,yearly.seasonality = FALSE,changepoint.prior.scale =0.001,changepoint.range =0.9 )
future=make_future_dataframe(prophet.m, periods = length(test))
prophet_forecast=predict(prophet.m,future)
prophet.m$fit.kwargs
## list()
r=ts(prophet_forecast$yhat-data$tank)
checkresiduals(r)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 5141.7, df = 10, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 10
Since the p-value is less than 0.05, the model exhibits lack of
fit.
ggplot(r, aes(sample = r)) +stat_qq()+geom_qq_line()+ggtitle("QQ Plot of the Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
autoplot(r)+geom_line(y=0)+theme_minimal()+ggtitle("Plot of The Residuals")
ggplot(r,aes(x=r))+geom_histogram(bins=20)+geom_density()+ggtitle("Histogram of Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
shapiro.test(r)
##
## Shapiro-Wilk normality test
##
## data: r
## W = 0.98078, p-value = 2.523e-07
print("ARIMA(2,2,1)")
## [1] "ARIMA(2,2,1)"
accuracy(f,test)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.05560763 5.180923 3.655655 0.0164543 0.2503019 0.463294
## Test set 190.34392895 198.410056 190.343929 3.6681379 3.6681379 24.122956
## ACF1
## Training set 0.005892095
## Test set NA
print("ETS")
## [1] "ETS"
accuracy(etsforc,test)
## ME RMSE MAE MPE MAPE
## Training set -0.0549773 5.239054 3.688071 -0.001526009 0.2651306
## Test set 189.9388230 198.572654 189.938823 3.659281250 3.6592812
## MASE ACF1
## Training set 0.4674022 0.03969414
## Test set 24.0716153 NA
print("NNETAR")
## [1] "NNETAR"
accuracy(nnetar_forecast,test)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.2493625 9.524709 7.468734 -0.2165987 0.585039 0.946539
## Test set 480.2073608 520.639393 480.207361 9.2234421 9.223442 60.858368
## ACF1
## Training set 0.7388282
## Test set NA
print("TBATS")
## [1] "TBATS"
accuracy(tbats_forecast,test)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.213985 5.21842 3.609074 0.002092469 0.2553715 0.4573906
## Test set 198.887362 208.98148 198.887362 3.829788413 3.8297884 25.2056952
## ACF1
## Training set 0.003401719
## Test set NA
print("Prophet")
## [1] "Prophet"
accuracy(prophet_forecast$yhat,test)
## ME RMSE MAE MPE MAPE
## Test set 4531.847 4532.249 4531.847 87.96403 87.96403
The best model is Holt’s method and the worst model is Prophet.
data_new=data
Series=ts(data_new$tank,start = 19048,end =19547 )
autoplot(f,series="ARIMA",color="blue")+autolayer(Series)+autolayer(fitted(fit1),color = "red", series="Fitted",linetype="dashed") + geom_vline(xintercept =19643 )+
ggplot2::ggtitle("ARIMA")+theme_minimal()+theme(legend.position="bottom")
plot(etsforc)
autoplot(etsforc,series="ETS",color="red")+autolayer(Series)+autolayer(fitted(etsforc),series="Fitted",linetype="dashed") + geom_vline(xintercept =19643 )+
ggplot2::ggtitle("ETS")+theme_minimal()+theme(legend.position="bottom")
plot(nnetar_forecast)
autoplot(Series,main="nnetar_forecast")+autolayer(nnetar_forecast) +autolayer(fitted(nnetar_forecast), series="Fitted",linetype="dashed")+ geom_vline(xintercept =19643 )+
ggplot2::ggtitle("NNETAR") +theme_minimal()+theme(legend.position="bottom")
## Warning: Removed 1 row containing missing values (`geom_line()`).
Series2=ts(data_new$tank,start =1 )
plot(tbats_forecast)
autoplot(Series2,series="data",color="red")+autolayer(tbats_forecast,series="Forecast")+autolayer(fitted(tbats_forecast),series="Fitted",linetype="dashed") + geom_vline(xintercept =595 )+
ggplot2::ggtitle("TBATS")+theme_minimal()+theme(legend.position="bottom")
plot(prophet.m,prophet_forecast)